a) Comparing raw and tamed data
library(readr)
library(dplyr)
library(fivethirtyeight)
# Raw data read from the FiveThirtyEight GitHub repository: year, month, and
# day of month are stored as separate variables
US_births_1994_2003_raw <- read_csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_1994-2003_CDC_NCHS.csv"
)
# Preview the first rows of the raw data
US_births_1994_2003_raw %>%
  head()
## # A tibble: 6 x 5
## year month date_of_month day_of_week births
## <int> <int> <int> <int> <int>
## 1 1994 1 1 6 8096
## 2 1994 1 2 7 7772
## 3 1994 1 3 1 10142
## 4 1994 1 4 2 11248
## 5 1994 1 5 3 11053
## 6 1994 1 6 4 11406
# Tamed data: in addition to the separate components, a single variable
# `date` of type "date" is included
US_births_1994_2003 %>%
  head()
## # A tibble: 6 x 6
## year month date_of_month date day_of_week births
## <int> <int> <int> <date> <ord> <int>
## 1 1994 1 1 1994-01-01 Sat 8096
## 2 1994 1 2 1994-01-02 Sun 7772
## 3 1994 1 3 1994-01-03 Mon 10142
## 4 1994 1 4 1994-01-04 Tues 11248
## 5 1994 1 5 1994-01-05 Wed 11053
## 6 1994 1 6 1994-01-06 Thurs 11406
b) Why should we care?
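Without a single variable of type date, making time series plots is difficult: the separate year, month, and day variables would first have to be combined into one variable that can serve as the time axis.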
# Data wrangling with dplyr's filter(): keep only the rows for the year 1999
US_births_1999 <- filter(US_births_1994_2003, year == 1999)
# Time series as a base R line plot: date on the x-axis, births on the y-axis
with(
  US_births_1999,
  plot(
    x = date, y = births, type = "l",
    xlab = "Date", ylab = "Number of births", main = "1999 US Births"
  )
)
